#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date    : 2018-08-21 12:05:12
# @Author  : wanglele (18911756410@163.com)
# @Link    : None
# @Version : "Python 3.7"

import os
import sys
from HTMLParser import HtmlParser
from time import sleep

from DataPut import DataSave
from HTMLDownloader import HtmlDownload
from URLManager import URLManager


class SpiderMan:
    """Crawl coordinator: wires together URL management, page download,
    parsing, and data persistence for a Baidu Baike crawl."""

    def __init__(self):
        # Collaborators are all project-local components (see imports above).
        self.manager = URLManager()      # tracks new/visited URL sets
        self.parser = HtmlParser()       # extracts links + data from a page
        self.download = HtmlDownload()   # fetches raw HTML for a URL
        self.datasave = DataSave()       # accumulates data, dumps to CSV

    def crawl(self, root_url):
        """Crawl pages breadth-first starting from *root_url*.

        Each iteration pops one unvisited URL, downloads and parses it,
        queues newly discovered URLs, and stores the extracted data.
        Failures on a single page are logged and skipped (best-effort).
        When no new URLs remain, all collected data is written to CSV.

        :param root_url: str, the seed URL to start crawling from.
        :returns: None (results are persisted via self.datasave).
        """
        self.manager.add_new_url(root_url)

        while self.manager.has_new_url():
            # Polite delay so the target site does not block us;
            # a randomized interval would be even safer.
            sleep(0.5)
            # Pre-bind so the except clause below can always reference it,
            # even if get_new_url() itself raises.
            new_url = None
            try:
                new_url = self.manager.get_new_url()

                html = self.download.download(new_url)

                new_urls, data = self.parser.parser(new_url, html)

                self.manager.add_new_urls(new_urls)

                self.datasave.store_data(data)
                # Fixed typo: was 'Crawl Suecced'.
                print('Crawl Succeed %s' % self.manager.old_urls_size())
            except Exception as e:
                # Best-effort crawl: log the failure and move on to the
                # next URL rather than aborting the whole run.
                print("crawl failed %s %s" % (new_url, e))

        self.datasave.dict_to_csv("python")


if __name__ == "__main__":
    # Seed URL for the crawl (a Baidu Baike article page).
    entry_url = "https://baike.baidu.com/item/%E9%99%88%E7%91%B6/16952725"
    SpiderMan().crawl(entry_url)
